###############################################################
###############################################################
#### Stefan Müller, Samuel Brazys, and Alexander Dukalskis
#### Replication Scripts for: 
#### Discourse Wars and 'Mask Diplomacy': China's Global Image Management in Times of Crisis 
#### Political Research Exchange, 2024
#### Link to paper: https://doi.org/10.1080/2474736X.2024.2309178
###############################################################
###############################################################

## Note: check the 000_README.pdf file on Harvard Dataverse for 
## the full replication instructions and information on all code scripts.
## Link to Dataverse repository: https://doi.org/10.7910/DVN/KRXMXJ
## Please contact the authors if you have any questions or suggestions. 
## Note: due to copyright restrictions some of the files cannot be shared publicly.
## However, we provide all replication scripts and intermediate objects to reproduce
## the plots and tables included in the paper and Supporting Information.

## This file returns the most positive and most negative statements
## and aggregates the statements to the level of country-week observations

# load packages
library(tidyverse)
library(newsmap)
library(quanteda)
library(scales)
library(stringr)
library(quanteda.textstats)
library(maps)
library(estimatr)
library(dplyr)
library(xtable)
library(modelsummary)
library(countrycode)
library(lubridate)
library(broom)

# load custom ggplot2 scheme
source("function_theme_base.R")

# Table A01: seed words
dat_seed <- read.csv("data_sentiment_seed.csv")

head(dat_seed)

# keep only the two seed-word columns and give them the labels
# shown in the table
dat_seed <- select(dat_seed,
                   Culprit = negative,
                   Savior = positive)

# write the table as HTML (without row names)
x_seed <- xtable(dat_seed)
print(x_seed,
      file = "tab_a01.html",
      type = "html",
      include.rownames = FALSE)


# read all classified texts
dat_class_raw <- readRDS("data_dontshare/data_texts_lss.rds")

nrow(dat_class_raw)

# drop all columns whose name contains "_merge"
dat_class_raw <- dat_class_raw |>
    select(-contains("_merge"))

# first and last date in the data
min(dat_class_raw$date, na.rm = TRUE)
max(dat_class_raw$date, na.rm = TRUE)

# texts without a predicted country are removed
dat_class_raw <- filter(dat_class_raw, !is.na(country_predict))

# count the texts per domain-country combination and keep only
# combinations with at least 50 texts
dat_class_textsonly <- dat_class_raw |>
    group_by(domain, country_predict) |>
    mutate(n_articles_domain = n()) |>
    ungroup() |>
    filter(n_articles_domain >= 50)

summary(dat_class_textsonly$n_articles_domain)

# upper-case version of the country code; this is the key used to
# merge the country-level metadata
dat_class_textsonly <- mutate(dat_class_textsonly,
                              country_iso2c = str_to_upper(country_predict))

table(dat_class_textsonly$country_iso2c, useNA = "always")

# merge metadata

# load V-Dem democracy and media freedom scores

head(dat_class_raw$country)

dat_vdem <- read_csv("data_vdem_2019.csv")

# binary indicators: 1 if a country's score is above the median of the
# respective score across all rows of dat_vdem, 0 otherwise
# (NA when the score itself is missing)
dat_vdem <- dat_vdem |>
    ungroup() |>
    mutate(
        vdem_above_median_censor_media =
            ifelse(vdem_censor_media > median(vdem_censor_media, na.rm = TRUE), 1, 0),
        vdem_above_median_censor_internet =
            ifelse(vdem_censor_internet > median(vdem_censor_internet, na.rm = TRUE), 1, 0),
        vdem_above_median_liberal_democracy =
            ifelse(vdem_liberal_democracy > median(vdem_liberal_democracy, na.rm = TRUE), 1, 0),
        vdem_above_median_electoral_democracy =
            ifelse(vdem_electoral_democracy > median(vdem_electoral_democracy, na.rm = TRUE), 1, 0)
    )

# the number of rows should equal the number of unique country codes,
# otherwise the join below duplicates texts
nrow(dat_vdem)
length(unique(dat_vdem$country_iso2c))

dat_class_merged <- dat_class_textsonly |>
    left_join(dat_vdem, by = "country_iso2c")

# the merge must not change the number of texts; stop if it does
# (same check as for the export data further below)
nrow(dat_class_textsonly)
nrow(dat_class_merged)
stopifnot(nrow(dat_class_merged) == nrow(dat_class_textsonly))

# load UN voting agreement data
dat_voting <- read_csv("data_un_agreement.csv")

names(dat_voting)

# binary indicator: 1 if the agreement score is above the median
dat_voting <- dat_voting |>
    ungroup() |>
    mutate(un_agree_above_median = ifelse(un_agree > median(un_agree, na.rm = TRUE), 1, 0))

# the number of rows should equal the number of unique country codes
nrow(dat_voting)
length(unique(dat_voting$country_iso2c))

# natural join on all columns the two data frames share
# NOTE(review): the join keys are not visible in this script; presumably
# only `country_iso2c` is shared -- confirm against names(dat_voting)
dat_class_merged2 <- left_join(dat_class_merged,
                               dat_voting)

# the merge must not create duplicates; stop if the number of texts changed
nrow(dat_class_merged2)
nrow(dat_class_merged)
stopifnot(nrow(dat_class_merged2) == nrow(dat_class_merged))

# read the trade (export) data
dat_exports <- read_csv("data_export_china_2019.csv")

nrow(dat_exports)
length(unique(dat_exports$country_iso2c))

# flag countries whose `export_china_prop` lies above the median
dat_exports <- mutate(
    ungroup(dat_exports),
    export_china_above_median = ifelse(
        export_china_prop > median(export_china_prop, na.rm = TRUE),
        1, 0
    )
)

# natural join on all columns shared by both data frames
dat_class <- dat_class_merged2 |>
    left_join(dat_exports)


# abort if the merge changed the number of texts
stopifnot(nrow(dat_class_merged2) == nrow(dat_class))

nrow(dat_class)

# three most frequent domains per country (English texts only), collapsed
# into one string per country of the form "domain (n); domain (n); ..."
# NOTE(review): `dat_country_count` is reassigned by the map section that
# follows, so this table is only inspected through nrow() here
dat_country_count <- dat_class |>
    filter(type == "Not translated") |>
    group_by(country, country_predict, domain) |>
    count() |>
    group_by(country, country_predict) |>
    # sort by descending frequency so that the within-country rank
    # assigned below runs from the most to the least frequent domain
    arrange(desc(n)) |>
    mutate(rank = row_number()) |>
    filter(rank <= 3) |>
    arrange(country_predict, desc(n)) |>
    mutate(domain_count = paste0(domain, " (", n, ")")) |>
    group_by(country) |>
    summarise(`Top Domains` = paste(domain_count, collapse = "; "))


nrow(dat_country_count)

# data for the map: number of English texts per country, leaving out
# the countries that are not part of any regression analysis
dat_country_count <- dat_class |>
    filter(type == "Not translated",
           !country %in% c(
               "Croatia",
               "Hong Kong SAR China",
               "Macao SAR China",
               "Micronesia (Federated States of)",
               "Morocco",
               "North Korea",
               "Pitcairn Islands",
               "Taiwan",
               "Tuvalu"),
           !is.na(country_predict)) |>
    count(country, country_predict) |>
    select(-country)

# only the 100 relevant countries
length(unique(dat_country_count$country_predict))

nrow(dat_country_count)

sum(dat_country_count$n)

head(dat_country_count)

# rename to the column names used in the plot and switch to
# upper-case country codes so that they match the map regions
dat_country_count <- dat_country_count |>
    rename(id = country_predict, frequency = n) |>
    mutate(id = str_to_upper(id))

# world map with region names converted to ISO codes
world_map <- map_data("world") |>
    mutate(region = iso.alpha(region))

# store the objects required for Figure 01
save(world_map, dat_country_count,
     file = "data_fig_01.Rdata")

# Figure 01: world map shaded by the number of texts per country

# load objects from replication archive
load("data_fig_01.Rdata")

ggplot(dat_country_count, aes(map_id = id)) +
    # base layer: every map region in white
    # (argument spelled out as `data`; `dat` only worked through
    # partial argument matching)
    geom_map(data = world_map, map = world_map,
             aes(map_id = region), fill = "white",
             color = "grey50", linewidth = 0.2) +
    # countries with texts, shaded by frequency
    geom_map(aes(fill = frequency), color = "grey50", linewidth = 0.2,
             map = world_map) +
    # geom_map() does not set the axis limits itself
    expand_limits(x = world_map$long, y = world_map$lat) +
    scale_fill_gradient(low = "grey90", high = "grey10",
                        name = "Number of Texts",
                        labels = scales::comma_format(),
                        breaks = c(seq(0, 500000, 50000))) +
    theme_void() +
    coord_fixed() +
    theme(legend.position = "bottom",
          legend.title = element_text(vjust = 0.7),
          plot.background = element_rect(fill = 'white', colour = 'white'),
          panel.background = element_rect(fill = 'white', colour = 'white'),
          legend.key.width = unit(2.5, "cm"))

# ggsave() without a `plot` argument stores the last plot created
ggsave("fig_01.png",
       width = 9, height = 6,
       dpi = 300)
ggsave("fig_01.eps",
       width = 9, height = 6,
       device = "eps")
ggsave("fig_01.pdf",
       width = 9, height = 6)


## Descriptive counts

# countries that are not included in the did / regression analyses;
# defined once so that all counts below use the same list
countries_excluded <- c(
    "Croatia",
    "Hong Kong SAR China",
    "Macao SAR China",
    "Micronesia (Federated States of)",
    "Morocco",
    "North Korea",
    "Pitcairn Islands",
    "Taiwan",
    "Tuvalu")

# Keep the rows of `data` with the given `text_type` (value of the `type`
# column: "Not translated" for English texts, or "Translated") whose
# `country` is not listed in `excluded`.
filter_analysis_sample <- function(data,
                                   text_type = "Not translated",
                                   excluded = countries_excluded) {
    data |>
        filter(type == .env$text_type) |>
        filter(!country %in% .env$excluded)
}

# number of English statements

sum(dat_country_count$frequency)
# 1,536,013

# number of not translated statements
dat_class |>
    filter_analysis_sample() |>
    nrow()


# number of translated statements
dat_class |>
    filter_analysis_sample("Translated") |>
    nrow()

# number of domains (only English texts)
dat_class |>
    filter_analysis_sample() |>
    distinct(domain) |>
    nrow()
# [1] 1583 domains (English)

nrow(dat_class)

# number of unique documents
dat_class |>
    distinct(docname) |>
    nrow()
# 498635

# number of unique documents (English)
dat_class |>
    filter_analysis_sample() |>
    distinct(docname) |>
    nrow()
# [1] 387012


# number of countries (English only)
dat_class |>
    filter_analysis_sample() |>
    distinct(country_predict) |>
    nrow()
# 100


# number of countries with translated texts
# (the exclusion list is not applied here)
dat_class |>
    filter(type == "Translated") |>
    distinct(country_predict) |>
    nrow()
# 101

# all countries and languages
dat_class |>
    distinct(country_predict) |>
    nrow()
# 128



# example statements: standardise the score within each country
# (mean 0, sd 1 per `country_predict`); the result stays grouped
dat_examples <- dat_class |>
    select(fit, type, date, text, country_predict, docname) |>
    group_by(country_predict) |>
    mutate(fit_stand = (fit - mean(fit, na.rm = TRUE)) / sd(fit, na.rm = TRUE)) |>
    select(-fit)

summary(dat_examples$fit_stand)
sd(dat_examples$fit_stand, na.rm = TRUE)

# statements from Canada, sorted from the highest to the lowest score
dat_examples_can <- dat_examples |>
    filter(country_predict == "ca") |>
    arrange(desc(fit_stand)) |>
    mutate(date = as.character(date)) |>
    relocate(fit_stand)

# look up the scores of the statements reported in the paper

dat_examples_can |>
    filter(str_detect(text, "declared on Thursday that the coronavirus epidemic in China now constitutes a public health emergency of international concern"),
           between(fit_stand, -0.1, 0.1))
# 0.0627

dat_examples_can |>
    filter(str_detect(text, "Videos showing residents at the centre"),
           fit_stand > -0.08)
#  -0.0772

dat_examples_can |>
    filter(str_detect(text, "identifying people with infections and rapidly isolating them"))

dat_examples_can |>
    filter(str_detect(text, "The city at the centre of China's virus outbreak was reopening for business Monday after authorities lifted more of the controls that locked downs tens of millions of people for two months"))


dat_examples_can |>
    filter(str_detect(text, "divert attention to its renewed efforts to slow the cor"),
           docname == "https://www.thestar.com/news/world/europe/2020/05/18/china-to-provide-2-billion-to-help-with-virus-pandemic.html")
# 0.396

## daily tone: Chinese sources vs. all other countries

table(dat_class$country_predict, useNA = "always")

# label each text by whether its predicted country is China ("cn")
dat_class <- mutate(
    dat_class,
    china_dummy = ifelse(country_predict == "cn",
                         "Chinese Sources", "Other Countries")
)

table(dat_class$china_dummy)

# mean score per day and source group; English texts only, without the
# countries that are not included in the did analysis
dat_china_other <- dat_class |>
    filter(type == "Not translated",
           !country %in% c(
               "Croatia",
               "Hong Kong SAR China",
               "Macao SAR China",
               "Micronesia (Federated States of)",
               "Morocco",
               "North Korea",
               "Pitcairn Islands",
               "Taiwan",
               "Tuvalu"),
           !is.na(country_predict)) |>
    group_by(date, china_dummy) |>
    summarise(mean_score = mean(fit, na.rm = TRUE),
              .groups = "drop_last")

# store the object needed for Figure 04
save(dat_china_other, file = "data_fig_04.Rdata")

# read it back (starting point when only the plot is reproduced)
load("data_fig_04.Rdata")

# Figure 04: average daily media tone, Chinese sources vs. other countries
ggplot(dat_china_other,
       aes(x = date, y = mean_score,
           colour = china_dummy,
           linetype = china_dummy,
           shape = china_dummy)) +
    # daily means as jittered points, smoothed trend per group on top
    geom_jitter(size = 2) +
    geom_smooth(method = "gam", n = 1000,
                alpha = 1, fill = "grey80") +
    # direct labels replace the legend
    annotate("text", label = "Chinese Sources",
             x = as.Date("2020-03-01"), y = 2,
             colour = "darkred", size = 4.5) +
    annotate("text", label = "Other Countries",
             x = as.Date("2020-03-01"), y = -0.8,
             colour = "black", size = 4.5) +
    scale_colour_manual(values = c("darkred", "black")) +
    scale_linetype_manual(values = c(2, 1)) +
    scale_x_date(date_breaks = "2 months", date_labels = "%b %Y") +
    scale_y_continuous(limits = c(-2, 4)) +
    labs(x = "Date", y = "Average Daily Media Tone") +
    theme(legend.position = "none")

# store the figure in all three formats
ggsave("fig_04.pdf", width = 9, height = 5)
ggsave("fig_04.png", width = 9, height = 5, dpi = 300)
ggsave("fig_04.eps", width = 9, height = 5, device = "eps")

# run regression model

# countries that are not included in the did / regression analyses
countries_excluded <- c(
    "Croatia",
    "Hong Kong SAR China",
    "Macao SAR China",
    "Micronesia (Federated States of)",
    "Morocco",
    "North Korea",
    "Pitcairn Islands",
    "Taiwan",
    "Tuvalu")

# Mean score per source group (`china_dummy`), country, and week
# (week identifier = week number and year pasted together), after
# dropping the countries in `excluded`. The result is returned ungrouped
# so that `china_dummy` can be recoded afterwards without touching a
# grouping column.
aggregate_tone_by_week <- function(data, excluded = countries_excluded) {
    data |>
        filter(!country %in% .env$excluded) |>
        mutate(week = paste(lubridate::week(date),
                            lubridate::year(date),
                            sep = "_")) |>
        group_by(china_dummy, country_predict, week) |>
        summarise(mean_score = mean(fit, na.rm = TRUE),
                  .groups = "drop")
}

# English texts only
dat_reg_diff_en <- dat_class |>
    filter(type == "Not translated") |>
    aggregate_tone_by_week()

# English and translated texts
dat_reg_diff_all <- dat_class |>
    filter(!is.na(country_predict)) |>
    aggregate_tone_by_week()

# reverse the level order so that "Other Countries" is the reference
# category and the coefficient refers to "Chinese Sources"
dat_reg_diff_all$china_dummy <- forcats::fct_rev(dat_reg_diff_all$china_dummy)
dat_reg_diff_en$china_dummy <- forcats::fct_rev(dat_reg_diff_en$china_dummy)

# predict average score for Chinese sources vs other countries,
# with week fixed effects and standard errors clustered by country
lm_en <- lm_robust(mean_score ~ china_dummy + factor(week),
                   clusters = country_predict,
                   data = dat_reg_diff_en)
summary(lm_en)

# divide coefficient by standard deviation of score
# NOTE(review): 0.42 is hard-coded and the standard deviation is taken
# from the data with all texts, while the model uses English texts only;
# confirm that this is intended
0.42 / sd(dat_reg_diff_all$mean_score)

# NOTE(review): `models_main` is filled but the table below is built
# from `lm_en` directly
models_main <- list()
models_main[['(1) English texts']] <- lm_en

coefnames <- c("(Intercept)" = "(Intercept)",
               "china_dummyChinese Sources" = "Chinese Sources")


# Table A02 (week fixed effects are omitted from the table)
modelsummary(lm_en,
             statistic = "std.error",
             fmt = "%.3f",
             stars = c('*' = .05, '**' = .01, '***' = .001),
             coef_omit = "(week*)",
             coef_rename = coefnames,
             gof_omit = "se_type",
             output = "tab_a02.docx")


# number of texts per week (weeks start on Monday), leaving out texts
# without a predicted country and texts from China
dat_weeks_count <- dat_class |>
    filter(!is.na(country_predict),
           country_predict != "cn") |>
    mutate(week_id = floor_date(date, unit = "weeks", week_start = 1)) |>
    group_by(week_id) |>
    count()

sum(dat_weeks_count$n)

# count number of "mask diplomacy" terms

# "UTF-8" must be spelled in upper case: `encoding` is only used to mark
# the strings as Latin-1 or UTF-8, and other spellings are not recognised
dat_dict_terms <- read.csv("dictionary_terms_base.csv",
                           encoding = "UTF-8")


# turn every term trunk into a glob pattern; `word` and `sentiment` are the
# columns read by as.dictionary(). Both hold the same value, so each term
# becomes its own dictionary key
dat_dict_terms$word <- paste0(dat_dict_terms$Term.trunks, "*")
dat_dict_terms$sentiment <- paste0(dat_dict_terms$Term.trunks, "*")

# do not select China, Corona and Covid as "mask diplomacy" relevant terms
dat_dict_terms <- dat_dict_terms |>
    filter(!word %in% c("China*", "Corona*", "Covid*"))

dict_keywords <- as.dictionary(dat_dict_terms)

# look up the dictionary terms in every statement and record, per
# statement, which terms occur at least once

corp_relevant <- corpus(dat_class)

# tokens replaced by the dictionary keys they match
toks_keywords <- tokens(corp_relevant,
                        remove_punct = TRUE,
                        remove_numbers = TRUE) |>
    tokens_lookup(dictionary = dict_keywords)

# boolean weighting: 1 if a term occurs in a statement, 0 otherwise
dfmat_lookedup <- dfm_weight(dfm(toks_keywords), scheme = "boolean")


# 30 most frequent terms, separately by type of text
tstat_freq_md <- textstat_frequency(dfmat_lookedup,
                                    n = 30,
                                    groups = type)

tstat_freq_md <- tstat_freq_md |>
    mutate(group_plot = paste("Texts:", group))

# document-feature matrix as data frame (one row per statement)
dat_lookedup <- quanteda::convert(dfmat_lookedup, to = "data.frame")

# number of distinct terms per statement; the rows of dat_lookedup are
# in the same order as the rows of dat_class
dat_class$sum_boolean_aid <- dat_lookedup |>
    dplyr::select(-doc_id) |>
    rowSums()

# used further below to filter non-mask diplomacy content
table(dat_class$sum_boolean_aid)

# derive the continent of each text from its country name
# with the newsmap classifier
corp_countries <- corpus(dat_class, text_field = "country")

toks_countries <- tokens(corp_countries)

# labels: first level of the English newsmap dictionary
toks_label <- toks_countries |>
    tokens_lookup(dictionary = newsmap::data_dictionary_newsmap_en,
                  levels = 1,
                  nested_scope = "dictionary")

dfmt_label <- dfm(toks_label)

# features: keep the original case and retain only features that start
# with a capital letter (proper nouns)
dfmt_feat <- toks_countries |>
    dfm(tolower = FALSE) |>
    dfm_select(pattern = "^[A-Z][A-Za-z1-2]+",
               selection = "keep",
               valuetype = "regex",
               case_insensitive = FALSE)


model_continent <- textmodel_newsmap(dfmt_feat, dfmt_label)

# predicted continent for every text
dat_class$continent <- predict(model_continent)

# rename "america" to "americas"
dat_class <- mutate(dat_class,
                    continent = dplyr::recode(continent, "america" = "americas"))

table(dat_class$continent)

# get iso3 country code to merge data with COVID cases
# `country_predict` holds lower-case codes (e.g. "ca", "cn"); they are
# converted to upper case first so that the lookup does not depend on
# how countrycode() treats the case of ISO codes
dat_class$iso3 <- countrycode(str_to_upper(dat_class$country_predict),
                              origin = "iso2c",
                              destination = "iso3c")


# week number (shifted by one), year, and combined week-year identifier
dat_class <- dat_class |>
    mutate(week = lubridate::week(date) - 1,
           year = lubridate::year(date),
           week_year = paste(week, year, sep = "_"))


# country-week aggregation of the scores
names(dat_class)

# all countries, English and translated texts; the country-level
# covariates are part of the grouping so that they are kept in the result
dat_scores_country_week <- dat_class |>
    mutate(week_id = floor_date(date, unit = "weeks", week_start = 1)) |>
    group_by(
        continent, iso3, country, week_id,
        vdem_censor_internet, vdem_censor_media,
        vdem_above_median_censor_media,
        vdem_above_median_censor_internet,
        vdem_above_median_liberal_democracy,
        vdem_above_median_electoral_democracy,
        vdem_electoral_democracy, vdem_liberal_democracy,
        export_china_prop, export_world, export_china,
        export_china_above_median,
        un_agree, un_agree_above_median
    ) |>
    summarise(
        # first and last day with texts in the week
        date_week_news_start = min(date),
        date_week_news_end = max(date),
        mean_score_after_reporting_cases = mean(fit, na.rm = TRUE),
        n_texts_week_full = n(),
        .groups = "drop"
    )

table(dat_class$sum_boolean_aid)

# Mean score and number of texts per continent, country, and week (weeks
# start on Monday). `suffix` is appended to the names of the two summary
# columns ("mean_score_after_reporting_cases_<suffix>" and
# "n_texts_week_<suffix>"). Returns an ungrouped data frame.
summarise_tone_country_week <- function(data, suffix) {
    data |>
        mutate(week_id = floor_date(date, unit = "weeks",
                                    week_start = 1)) |>
        group_by(continent, iso3, country, week_id) |>
        summarise(
            "mean_score_after_reporting_cases_{suffix}" := mean(fit, na.rm = TRUE),
            "n_texts_week_{suffix}" := n(),
            .groups = "drop"
        )
}

# exclude texts with mask diplomacy related terms: keep only texts
# in which none of the dictionary terms occurs
terms_mask_dipl <- 0
dat_scores_no_maskdiplomacy <- dat_class |>
    filter(sum_boolean_aid == terms_mask_dipl)

# share of texts removed by this restriction
(nrow(dat_class) - nrow(dat_scores_no_maskdiplomacy)) / nrow(dat_class)
# reduction by 16 percent

# texts without mask diplomacy terms
dat_scores_country_week_no_maskdiplomacy <- dat_scores_no_maskdiplomacy |>
    summarise_tone_country_week("no_maskdipl")


# texts without mask diplomacy terms, non-translated texts only
# (the input is already restricted to texts without these terms)
dat_scores_country_week_no_maskdiplomacy_no_translated <- dat_scores_no_maskdiplomacy |>
    filter(type == "Not translated") |>
    summarise_tone_country_week("no_maskdipl_no_translated")

table(dat_class$type)

## only consider english articles
dat_scores_country_week_no_translated <- dat_class |>
    filter(type == "Not translated") |>
    summarise_tone_country_week("no_translated")

# merge all datasets
# the country-week aggregations share exactly these four columns; the
# keys are spelled out so that the joins do not depend on whichever
# columns happen to have the same name
keys_country_week <- c("continent", "iso3", "country", "week_id")

dat_scores_country_week_merged <- dat_scores_country_week |>
    left_join(dat_scores_country_week_no_maskdiplomacy,
              by = keys_country_week) |>
    left_join(dat_scores_country_week_no_maskdiplomacy_no_translated,
              by = keys_country_week) |>
    left_join(dat_scores_country_week_no_translated,
              by = keys_country_week)



# correlation between the scores with and without mask diplomacy texts
cor.test(dat_scores_country_week_merged$mean_score_after_reporting_cases,
         dat_scores_country_week_merged$mean_score_after_reporting_cases_no_maskdipl)


# marker for country-weeks with news data; used after merging the case
# data to count the weeks with data per country
dat_scores_country_week_merged$data_available <- 1


# read the case data; empty strings are treated as missing values
dat_cases <- read.csv("data_cases_ecdc.csv",
                      na.strings = "", fileEncoding = "UTF-8-BOM")


# parse the reporting date (day/month/year) and derive week number
# (shifted by one), year, and the Monday-based week identifier
dat_cases <- mutate(
    dat_cases,
    date_reporting_cases = as.Date(dateRep, format = "%d/%m/%Y"),
    week = lubridate::week(date_reporting_cases) - 1,
    year = lubridate::year(date_reporting_cases),
    week_id = floor_date(date_reporting_cases, unit = "weeks",
                         week_start = 1)
)


# keep and rename the relevant variables, then add weekly cases and
# deaths per million inhabitants (2019 population)
dat_cases_select <- dat_cases |>
    select(year, date_reporting_cases,
           cases_weekly,
           week_id,
           deaths_weekly,
           iso3 = countryterritoryCode,
           population_2019 = popData2019,
           notif_rate_per_100000_14_days = notification_rate_per_100000_population_14.days) |>
    mutate(cases_weekly_per_million = (cases_weekly / population_2019) * 1e6,
           deaths_weekly_per_million = (deaths_weekly / population_2019) * 1e6)


# add the country-week scores to the case data
dat_merged_country_weeks <- dat_cases_select |>
    left_join(dat_scores_country_week_merged,
              by = c("iso3", "week_id"))


dat_merged_country_weeks <- dat_merged_country_weeks |>
    # days between the last news item of the week and the reporting date;
    # identifier of the country-week
    mutate(diff_days_news_reporting = date_week_news_end - date_reporting_cases,
           country_week = paste(iso3, week_id, sep = "-")) |>
    # rows must be ordered by week within country for lag() below
    arrange(iso3, week_id) |>
    group_by(iso3) |>
    # keep only countries with news data in at least one week
    mutate(n_weeks_data = sum(data_available, na.rm = TRUE)) |>
    filter(n_weeks_data > 0) |>
    mutate(
        # values of the previous row of the same country
        mean_score_sameweek = lag(mean_score_after_reporting_cases),
        mean_score_sameweek_no_maskdip = lag(mean_score_after_reporting_cases_no_maskdipl),
        mean_score_sameweek_no_maskdip_no_translated = lag(mean_score_after_reporting_cases_no_maskdipl_no_translated),
        mean_score_sameweek_no_translated = lag(mean_score_after_reporting_cases_no_translated),
        date_week_news_start_lag = lag(date_week_news_start),
        date_week_news_end_lag = lag(date_week_news_end),
        diff_days_news_reporting_lag = date_week_news_end_lag - date_reporting_cases,
        # week number and week-year identifier based on the week start
        week = week(week_id),
        week_year = paste(week(week_id), year(week_id), sep = "_")
    ) |>
    # identifiers, dates and scores first, remaining columns afterwards
    select(year, country, iso3, week_id, week, week_year,
           date_reporting_cases,
           starts_with("date_week"),
           mean_score_after_reporting_cases,
           mean_score_after_reporting_cases_no_maskdipl,
           mean_score_after_reporting_cases_no_maskdipl_no_translated,
           mean_score_after_reporting_cases_no_translated,
           mean_score_sameweek,
           mean_score_sameweek_no_maskdip,
           mean_score_sameweek_no_maskdip_no_translated,
           mean_score_sameweek_no_translated,
           everything())


# continent name in title case (label for plots); rows without a
# country code are dropped before the country-week data are exported
dat_merged_country_weeks <- dat_merged_country_weeks |>
    mutate(continent_plot = str_to_title(continent)) |>
    filter(!is.na(iso3))

readr::write_csv(dat_merged_country_weeks, "data_news_covid_weeks.csv")


# data availability per week and continent

# texts for which no continent was predicted
dat_class_na <- dat_class |>
    filter(is.na(continent))

# continent names in title case
dat_class <- dat_class |>
    mutate(continent = str_to_title(continent))

table(dat_class$type)

# number of English texts per continent and week (weeks start on Monday),
# without the countries that are excluded from the analysis
dat_class_week <- dat_class |>
    filter(!is.na(continent),
           type == "Not translated",
           !country %in% c(
               "Croatia",
               "Hong Kong SAR China",
               "Macao SAR China",
               "Micronesia (Federated States of)",
               "Morocco",
               "North Korea",
               "Pitcairn Islands",
               "Taiwan",
               "Tuvalu")) |>
    mutate(week_id = floor_date(date, unit = "weeks", week_start = 1)) |>
    group_by(continent, week_id) |>
    summarise(n_articles = n(), .groups = "drop_last")


# store the data for Figure 02
save(dat_class_week, file = "data_fig_02.RData")

# read it back (starting point when only the plot is reproduced)
load("data_fig_02.RData")

# Figure 02: number of statements per week, one panel per continent
ggplot(dat_class_week,
       aes(x = week_id, y = n_articles)) +
    geom_line(colour = "grey50") +
    geom_point(size = 1) +
    facet_wrap(~continent, nrow = 1) +
    scale_y_continuous(labels = scales::comma_format()) +
    scale_x_date(date_breaks = "2 months", date_labels = "%b %Y") +
    labs(x = NULL, y = "Relevant Statements (Per Week)") +
    # vertical date labels so that they do not overlap
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

# store the figure in all three formats
ggsave("fig_02.png", width = 9, height = 4.5, dpi = 300)
ggsave("fig_02.pdf", width = 9, height = 4.5)
ggsave("fig_02.eps", width = 9, height = 4.5, device = "eps")
